This is a comprehensive Exploratory Data Analysis for the Web Traffic Time Series Forecasting competition.
import pandas as pd
import numpy as np
from dfply import *
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model as lm
from plotnine import *
from datetime import datetime
from pyramid.arima import auto_arima
import math
Note that the key_1.csv data is not small, at about 700 MB; for the purpose of this exploration we only read a few rows of it.
# Load the competition data.  key_1.csv is ~700 MB, so only a handful of
# rows are read here for inspection.
train= pd.read_csv('Files/train_1.csv')
key= pd.read_csv('Files/key_1.csv',nrows=5)
Dimension of train and CSV
# First look at the raw tables: shapes, summary statistics, and the Page column.
train
print(train.shape)
print(key.shape)
# describe() must be *called*; `train.describe` alone just prints the bound method.
print(train.describe())
key
train["Page"]
train['Page'][0:5]
key
# Fraction of missing cells: divide by the total number of cells
# (rows * columns); the original divided by rows + columns, which is not a fraction.
sum(train.isnull().sum())/ (train.shape[1]*train.shape[0])
To make the training data easier to handle we split it into two part: the article information (from the Page column) and the time series data (tdates) from the date columns. We briefly separate the article information into data from wikipedia, wikimedia, and mediawiki due to the different formatting of the Page names. After that, we rejoin all article information into a common data set (tpages).
# Separate the time-series columns from the Page metadata, then split the
# pages into the three site families (their names are formatted differently).
tdates = train.iloc[:, 1:]
articles = train["Page"]
mediawiki = articles[articles.str.contains("mediawiki")].to_frame()
wikimedia = articles[articles.str.contains("wikimedia")].to_frame()
wikipedia = articles[articles.str.contains("wikipedia")].to_frame()
def fil(x):
    """Return True when the page name belongs to neither the mediawiki nor
    the wikimedia family (i.e. it is a plain wikipedia page)."""
    # The original used `or`, which is True whenever at least one substring is
    # absent -- so mediawiki and wikimedia pages passed the filter too.
    return "mediawiki" not in x and "wikimedia" not in x
# Keep only pages passing the fil predicate.  NOTE(review): wikipedia page
# names already exclude the other two families, so this filter appears
# redundant -- confirm against the full data.
wikipedia["Page"]=wikipedia[wikipedia["Page"].apply(fil)]["Page"]
wikipedia=wikipedia.reset_index(drop=True)
def splitF(x):
    """Return the access/agent suffix after '.wikipedia.org_', or the string
    'None' when the page name does not contain that separator."""
    try:
        return x.split(".wikipedia.org_")[1]
    except IndexError:  # narrow catch: only a missing separator, not all errors
        return "None"
# foo: everything before '.wikipedia.org_' (article name + locale suffix);
# bar: everything after it (access type + agent).
wikipedia["foo"]= wikipedia["Page"].apply(lambda x: x.split(".wikipedia.org_")[0])
wikipedia["bar"]=wikipedia["Page"].apply(splitF)
wikipedia
def splitA(x):
    """Return the token after the first underscore (the agent part of an
    'access_agent' string), or 'None' when there is no underscore."""
    try:
        return x.split("_")[1]
    except IndexError:  # narrow catch: only a missing underscore, not all errors
        return "None"
# Decompose the helper columns: foo ends with '_xx' (3 chars), so `article`
# drops the last three characters and `locale` keeps the final two.
wikipedia["rowname"] = wikipedia.index
wikipedia["article"]=wikipedia["foo"].apply(lambda x: x[0:-3])
wikipedia["locale"]= wikipedia["foo"].apply(lambda x: x[-2:])
wikipedia["access"]=wikipedia["bar"].apply(lambda x: x.split("_")[0])
wikipedia["agent"]=wikipedia["bar"].apply(splitA)
wikipedia.drop(["Page","foo","bar"] ,axis=1, inplace=True)
# Same decomposition for wikimedia pages; their separator is fixed, so the
# locale is a constant tag.
wikimedia["rowname"] = wikimedia.index
wikimedia["article"]= wikimedia["Page"].apply(lambda x: x.split("_commons.wikimedia.org_")[0])
wikimedia["bar"]=wikimedia["Page"].apply(lambda x: x.split("_commons.wikimedia.org_")[1])
wikimedia["access"]=wikimedia["bar"].apply(lambda x: x.split("_")[0])
wikimedia["agent"]=wikimedia["bar"].apply(lambda x: x.split("_")[1])
wikimedia["locale"]="wikmed"
wikimedia.drop(["Page","bar"],axis=1, inplace=True)
# And the same for mediawiki pages, with their own constant locale tag.
mediawiki["rowname"] = mediawiki.index
mediawiki["article"]= mediawiki["Page"].apply(lambda x: x.split("_www.mediawiki.org_")[0])
mediawiki["bar"]=mediawiki["Page"].apply(lambda x: x.split("_www.mediawiki.org_")[1])
mediawiki["access"]=mediawiki["bar"].apply(lambda x: x.split("_")[0])
mediawiki["agent"]=mediawiki["bar"].apply(lambda x: x.split("_")[1])
mediawiki["locale"]="medwik"
mediawiki.drop(["Page","bar"],axis=1, inplace=True)
wikipedia
# Recombine the three families into one tidy article-metadata table.
frames=[wikipedia,wikimedia,mediawiki]
tpages=pd.concat(frames,ignore_index=True)
In order to plot the time series data we use a helper function that allows us to extract the time series for a specified row number. (The normalised version is to facilitate the comparison between multiple time series curves, to correct for large differences in view count.)
dateView= pd.DataFrame(columns={'dates','views'})
def extract_ts(rownr):
    """Return a fresh DataFrame with columns 'dates' (the wide-table column
    labels) and 'views' (the counts of row `rownr` of `tdates`)."""
    # Build a new frame on every call: the original mutated and returned one
    # module-level DataFrame, so successive calls aliased (and silently
    # overwrote) each other's results.
    return pd.DataFrame({"dates": tdates.columns.values,
                         "views": tdates.values[rownr, :]})
dateMeanView= pd.DataFrame(columns={'dates','views'})
def extract_ts_nrm(rownr):
    """Like extract_ts, but with the view counts divided by their mean so
    that series of very different popularity can be overlaid."""
    # Fresh frame per call, for the same aliasing reason as extract_ts.
    views = np.array(tdates.values[rownr, :])
    return pd.DataFrame({"dates": tdates.columns.values,
                         "views": views / np.mean(views)})
A custom-made plotting function allows us to visualise each time series and extract its meta data
def plot_rownr(rownr):
    """Plot the raw daily views of article row `rownr`, with a loess
    smoother and a linear-trend overlay."""
    title = tpages['article'][rownr] + "-" + tpages['locale'][rownr] + "-" + tpages['access'][rownr]
    series = extract_ts(rownr)
    series["dates"] = pd.to_datetime(series['dates'])
    figure = (ggplot(series, aes(x='dates', y='views'))
              + geom_line()
              + geom_smooth(color="blue", span=1 / 5)
              + stat_smooth(method="lm")
              + labs(title=title))
    return figure
This is how it works (to visualise timey-wimey stuff):
plot_rownr(1)
def plot_rownr_log(rownr):
    """Plot the mean-normalised views of row `rownr` on a logarithmic
    y-axis (mitigates the visual impact of large spikes)."""
    art=tpages['article'][rownr]
    loc=tpages['locale'][rownr]
    acc=tpages['access'][rownr]
    dateView=extract_ts_nrm(rownr)
    dateView["dates"] = pd.to_datetime(dateView['dates'])
    # The log scale originally sat on an unreachable line after the return;
    # fold it (and the axis label) into the returned plot.
    return (ggplot(dateView, aes(x='dates', y='views'))
            + geom_line()
            + geom_smooth(color="blue", span=1/5)
            + stat_smooth(method="lm")
            + labs(title=art + "-" + loc + "-" + acc, y="log views")
            + scale_y_log10())
plot_rownr_log(1)
def plot_rownr_zoom(rownr, start, end):
    """Plot the mean-normalised views of row `rownr` restricted to the date
    window [start, end]; both bounds are 'YYYY-MM-DD' strings (inclusive)."""
    art=tpages['article'][rownr]
    loc=tpages['locale'][rownr]
    acc=tpages['access'][rownr]
    dateView=extract_ts_nrm(rownr)
    dateView["dates"] = pd.to_datetime(dateView['dates'])
    startDate=datetime.strptime(start , '%Y-%m-%d')
    endDate=datetime.strptime(end , '%Y-%m-%d')
    dateView=dateView[(dateView["dates"]>=startDate) & (dateView["dates"]<=endDate) ]
    # An unreachable trailing line after this return was removed.
    return ggplot(dateView,aes(x='dates',y='views'))+ geom_line()+geom_smooth(color = "blue", span = 1/5)+ stat_smooth(method = "lm") +labs(title = art+"-"+loc+"-"+acc)
plot_rownr_zoom(1,'2015-03-01','2015-09-01')
In addition, with the help of the extractor tool we define a function that re-connects the Page information to the corresponding time series and plots this curve according to our specification on article name, access type, and agent for all the available languages:
def plot_names(art, acc, ag):
    """Overlay the raw time series of article `art` for every locale,
    filtered to access type `acc` and agent `ag`, coloured by locale."""
    pick=tpages[(tpages['article']==art) & (tpages['access']==acc) & (tpages['agent']==ag)]
    pick_nr=pick['rowname']
    pick_loc=pick['locale']
    # Copy each extraction: extract_ts reuses one module-level frame, so
    # without the copy every appended chunk would alias the last extraction.
    tdat= extract_ts(pick_nr.values[0]).copy()
    tdat["loc"]=pick_loc.values[0]
    for i in range(1,len(pick)):
        foo= extract_ts(pick_nr.values[i]).copy()
        foo["loc"]=pick_loc.values[i]
        tdat=pd.concat([tdat,foo])
    tdat["dates"] = pd.to_datetime(tdat['dates'])
    # Local renamed from `plt`, which shadowed matplotlib.pyplot.
    fig=ggplot(tdat,aes(x='dates',y='views',color='loc'))+geom_line() + labs(title = art+"-"+acc+"-"+ag)
    return fig
def plot_names_nrm(art, acc, ag):
    """Overlay the MEAN-NORMALISED time series of article `art` for every
    locale, filtered to access type `acc` and agent `ag`."""
    pick=tpages[(tpages['article']==art) & (tpages['access']==acc) & (tpages['agent']==ag)]
    pick_nr=pick['rowname']
    pick_loc=pick['locale']
    # Copy each extraction: extract_ts_nrm reuses one module-level frame, so
    # without the copy every appended chunk would alias the last extraction.
    tdat= extract_ts_nrm(pick_nr.values[0]).copy()
    tdat["loc"]=pick_loc.values[0]
    for i in range(1,len(pick)):
        # Use the normalised extractor here too; the original called the raw
        # extract_ts and mixed unnormalised series into a normalised plot.
        foo= extract_ts_nrm(pick_nr.values[i]).copy()
        foo["loc"]=pick_loc.values[i]
        tdat=pd.concat([tdat,foo])
    tdat["dates"] = pd.to_datetime(tdat['dates'])
    # Local renamed from `plt`, which shadowed matplotlib.pyplot.
    fig=ggplot(tdat,aes(x='dates',y='views',color='loc'))+geom_line() + labs(title = art+"-"+acc+"-"+ag)
    return fig
plot_names("The_Beatles", "all-access", "all-agents")
These are the tools we need for a visual examination of arbitrary individual time series data. In the following, we will use them to illustrate specific observations that are of particular interest.
In the next step we will have a more global look at the population parameters of our training time series data. Also here, we will start with the wikipedia data. The idea behind this approach is to probe the parameter space of the time series information along certain key metrics and to identify extreme observations that could break our forecasting strategies.
Before diving into the time series data let’s have a look how the different meta-parameters are distributed:
# Bar charts of the meta-parameters extracted from the page names.
ggplot(tpages, aes(x='agent')) + geom_bar(fill = "red")
ggplot(tpages, aes(x='access')) + geom_bar(fill = "red")
ggplot(tpages, aes(x='locale', fill=tpages['locale'])) + geom_bar()
We find that our wikipedia data includes 7 languages: German, English, Spanish, French, Japanese, Russian, and Chinese. All of those are more frequent than the mediawiki and wikimedia pages. Mobile sites are slightly more frequent than desktop ones
We start with a basic set of parameters: mean, standard deviation, amplitude, and a the slope of a naive linear fit. This is our extraction function:
param= pd.DataFrame(columns={'rowname','slope','min_view','max_view','mean_view','med_view','sd_view'})
def params_ts1(rownr):
    """Return a one-row DataFrame of summary statistics (linear slope, min,
    max, mean, median, std of views) for article row `rownr`."""
    dateView = extract_ts(rownr)
    dateView["dates"] = pd.to_datetime(dateView['dates'])
    y = dateView["views"]
    # LinearRegression cannot consume datetime64 directly; regress against
    # the day offset from the first date instead.
    x = (dateView["dates"] - dateView["dates"].min()).dt.days.values.reshape(-1, 1)
    model = lm.LinearRegression()
    model.fit(x, y)
    # Build the row locally: assigning scalars column-by-column to the empty
    # module-level `param` frame produced zero rows and shared state between calls.
    return pd.DataFrame({'rowname': [rownr],
                         'slope': [model.coef_[0]],
                         'min_view': [np.min(y)],
                         'max_view': [np.max(y)],
                         'mean_view': [np.mean(y)],
                         'med_view': [np.median(y)],
                         'sd_view': [np.std(y)]})
And here we run it. (Note, that in this kernel version I’m currently using a sub-sample of the data for reasons of runtime. My extractor function is not very elegant, yet, and exceeds the kernel runtime for the complete data set.)
# Sample 5500 rows (with replacement) to keep the kernel runtime manageable,
# compute the summary parameters for each complete (NaN-free) series, and
# stack them into one table indexed by rowname.
x=np.random.choice(tpages["rowname"], size=5500)
x.astype(object)
joinedParam= pd.DataFrame(columns={'rowname','slope','min_view','max_view','mean_view','med_view','sd_view'})
for i in x:
    dateView=extract_ts(i)
    dateView["dates"] = pd.to_datetime(dateView['dates'])
    if not dateView["views"].isnull().values.any():
        joinedParam=pd.concat([joinedParam,params_ts1(i)])
joinedParam.index=joinedParam["rowname"]
Let’s explore the parameter space we’ve built. (The global shape of the distributions should not be affected by the sampling.) First we plot the histograms of our main parameters:
# Histograms of the population parameters (log-scaled x-axes).
ggplot(joinedParam, aes(x='mean_view'))+geom_histogram(fill = "red", bins = 50) + scale_x_log10()
ggplot(joinedParam, aes(x='med_view'))+geom_histogram(fill = "red", bins = 50) + scale_x_log10()
# Coefficient of variation: standard deviation relative to the mean.
difsdmean= pd.DataFrame(columns=["sd/mean"])
difsdmean["sd/mean"]=np.array(joinedParam["sd_view"])/np.array(joinedParam["mean_view"])
ggplot(difsdmean, aes(x='sd/mean'))+geom_histogram(fill = "red", bins = 50) + scale_x_log10()
ggplot(joinedParam, aes(x='slope'))+geom_histogram(fill = "red", bins = 50) + scale_x_log10()
We find:
The distribution of average views is clearly bimodal, with peaks around 10 and 200-300 views. Something similar is true for the number of maximum views, although here the first peak (around 200) is curiously narrow. The second peak is centred above 10,000.
The distribution of standard deviations (divided by the mean) is skewed toward higher values with larger numbers of spikes or stronger variability trends. Those will be the observations that are more challenging to forecast.
The slope distribution is reasonably symmetric and centred notably above zero.
# Attach the page metadata to the computed parameters via rowname.
par_page= tpages.merge(joinedParam,on="rowname")
par_page
Let’s split it up by locale and focus on the densities:
# Stacked density plots of the parameters, split by locale.
ggplot(par_page, aes(x='mean_view', fill='locale'))+ geom_density(position = "stack") +scale_x_log10(limits = [1,1e4])
ggplot(par_page, aes(x='max_view', fill='locale'))+ geom_density(position = "stack") +scale_x_log10(limits = [1,1e4])
ggplot(par_page, aes(x='slope', fill='locale'))+ geom_density(position = "stack")
ggplot(par_page, aes(x='sd_view', fill='locale'))+ geom_density(position = "stack") +scale_x_log10(limits = [1,1e4])
# Amplitude above the mean, paired with the mean, for the 2-d plots below.
twoDgraph= pd.DataFrame(columns=["max-mean","mean_view"])
twoDgraph["max-mean"]= np.array(joinedParam["max_view"])-np.array(joinedParam["mean_view"])
twoDgraph["mean_view"]= joinedParam["mean_view"].values
We find:
The Chinese pages (zh, in pink) are slightly but notably different from the rest. They have lower mean and max views and also less variation. Their slope distribution is broader, but also shifted more towards positive values compared to the other curves.
The peak in max views around 200-300 is most pronounced in the french pages (fr, in turquoise).
The english pages (en, in mustard) have the highest mean and maximum views, which is not surprising.
Next, we will examine binned 2-d histograms.
ggplot(twoDgraph,aes(x="max-mean",y="mean_view"))+geom_bin2d(bins = [50,50]) +scale_x_log10() +scale_y_log10() + labs(x = "maximum views above mean", y = "mean views")
We find:
twoDgraph.size
# Zoom into the upper end: axis limits start at 1/50 of the respective maxima.
limx = [max(joinedParam["max_view"]/50), max(joinedParam["max_view"])]
limy = [max(joinedParam["mean_view"]/50), max(joinedParam["mean_view"])]
ggplot(twoDgraph,aes(x="max-mean",y="mean_view"))+ geom_point(size = 5, color = "red", alpha=0.3) +scale_x_log10(limits=limx) +scale_y_log10(limits=limy) + labs(x = "maximum views above mean", y = "mean views")
Here we find a number of main pages and other meta pages (in the full data set).
Another question: Does the (assumed) linear change in views depend on the total number of views?
ggplot(joinedParam,aes(x='slope',y='mean_view'))+geom_point(color = "red", alpha = 0.1) +scale_y_log10()+scale_x_log10() +labs(x = "linear slope relative to slope error", y = "mean views")
We find that articles with higher average view-count have more variability in their linear trends. However, this might be due to our slope normalisation which will decrease the effective slope for low view counts. It should not, however, affect the observation that the slopes of low-view articles are on average slightly higher than those of high-view articles. Such an effect could be caused by viewing spikes, of course, but I would expect those to be randomly distributed.
Based on the overview parameters we can focus our attention on those articles for which the time series parameters are at the extremes of the parameter space.
Those are the observations with the highest slope values. (In the sample this will be different, but in the full wikipedia data set the top 10 have rownames 91728, 55587, 108341, 70772, 95367, 18357, 95229, 116150, 94975, 77292).
slopesort=joinedParam.sort_values(by='slope',ascending=False).head(n=5)
Let’s have a look at the time series data of the top 4 articles:
# Plot the time series of the top-slope articles.
plot_rownr(int(slopesort['rowname'].values[0]))
plot_rownr(int(slopesort['rowname'].values[1]))
plot_rownr(int(slopesort['rowname'].values[2]))
plot_rownr(int(slopesort['rowname'].values[3]))
plot_rownr(int(slopesort['rowname'].values[4]))
We find:
Lots of love for Twenty One Pilots in Spain. Those rapid rises and wibbly-wobbly bits are going to be difficult to predict, unless there’s a periodic modulation on top of the large-scale trend. Certainly worth figuring out.
We also see that our generic loess smoother is dealing rather well with most of the slower variability patterns and could be used to remove the low-frequency structures for further analysis.
Let’s compare the interest in Twenty One Pilots for the different countries, to see whether a prediction for one of them could learn from the others:
plot_names_nrm("Twenty_One_Pilots", "all-access", "all-agents")
Note, that those curves are normalised to mean views (each) and have a logarithmic y-axis to mitigate the effect of large spikes. This chart is for relative trend comparison.
We find:
Germany and France show quite similar viewing behaviour, while Russia and Spain are comparable too; especially in the early rise in interest. The English pages show less dramatic changes but end up
With a purely time-series-forecast approach I think that the large spikes are close to impossible to predict. However, external data could help a lot here.
Those viewing numbers were going up, but which articles were going down? (Top 10: 95856, 74115, 8388, 103659, 100213, 9633, 102481, 38458, 30042, 74002.)
#Article going down
# Ascending sort on slope == the five most steeply DECLINING series.
articleAscending=joinedParam.sort_values(by='slope',ascending=True).head(n=5)
articleAscending
plot_rownr(int(articleAscending['rowname'].values[0]))
plot_rownr(int(articleAscending['rowname'].values[1]))
plot_rownr(int(articleAscending['rowname'].values[2]))
plot_rownr(int(articleAscending['rowname'].values[3]))
The main page itself on mobile, and review articles on 2015 were the biggest losers.
The top 10 wikipedia rows are 9775, 38574, 103124, 99323, 74115, 39181, 10404, 33645, 34258, and 26994. Bingo, anyone?
# 4.2 High standard deviations
# Normalise the spread by the mean so that busy pages don't dominate trivially.
joinedParam['sd_div_mean']= joinedParam['sd_view']/joinedParam['mean_view']
# The section studies HIGH variability, so sort descending; the original
# ascending sort picked the five most stable series instead.
sddivsort=joinedParam.sort_values(by='sd_div_mean',ascending=False).head(n=5)
#plot 4 graphs
plot_rownr(int(sddivsort['rowname'].values[0]))
plot_rownr(int(sddivsort['rowname'].values[1]))
plot_rownr(int(sddivsort['rowname'].values[2]))
plot_rownr(int(sddivsort['rowname'].values[3]))
Those are pretty strong spikes in the main page views, even if the baseline is around 1-10 million to begin with. They look consistent though over different languages. Any ideas what could cause this?
If we normalise standard deviation by mean we get a different set of results:
# Hand-picked rows with extreme normalised variability in the full data set.
plot_rownr(10032)
plot_rownr(38812)
plot_rownr(86905)
plot_rownr(102521)
Those are very, very suspicious. They are essentially low baselines with single dates that have way higher view counts (e.g. around 20 vs 2 million for the upper left one). These have to be errors in the data which can be dangerous for predictions if they appear close to either end of the date window. In other cases, most smoothing methods should be able to deal with them.
The top amplitudes are the same as the top standard deviations, due to the spikey nature of the variability:
#Large variability amplitudes
# Amplitude above the mean, then the five largest.
joinedParam['maxView_meanView']= joinedParam['max_view']-joinedParam['mean_view']
maxView_meanViewSort=joinedParam.sort_values(by='maxView_meanView',ascending=False).head(n=5)
# Plot the top-amplitude rows; the original re-plotted `sddivsort` and left
# maxView_meanViewSort unused.
plot_rownr(int(maxView_meanViewSort['rowname'].values[0]))
plot_rownr(int(maxView_meanViewSort['rowname'].values[1]))
plot_rownr(int(maxView_meanViewSort['rowname'].values[2]))
plot_rownr(int(maxView_meanViewSort['rowname'].values[3]))
Those are the time series of the most popular pages, which we already identified as the main pages in the plots above:
# The five most popular series by mean views (the main pages).
meandescSort=joinedParam.sort_values(by='mean_view',ascending=False).head(n=5)
plot_rownr(int(meandescSort['rowname'].values[0]))
plot_rownr(int(meandescSort['rowname'].values[1]))
plot_rownr(int(meandescSort['rowname'].values[2]))
plot_rownr(int(meandescSort['rowname'].values[3]))
In addition to the spikes on the English main page there is a surprising amount of variability as exemplified by the long-term structure in the German main page.
What about other main pages, as identified in the zoom-in above?
# Same popular pages, now with the log-scaled plotting helper.
plot_rownr_log(int(meandescSort['rowname'].values[0]))
plot_rownr_log(int(meandescSort['rowname'].values[1]))
plot_rownr_log(int(meandescSort['rowname'].values[2]))
plot_rownr_log(int(meandescSort['rowname'].values[3]))
Here 3 of the 4 plots have a logarithmic y-axis to improve the clarity of visualising the time series’ with strong spikes. We see that also those popular pages exhibit strong variability on various time scales.
In summary: We have identified the time series’ with the highest variability according to basic criteria. We also found a few time series sets with bogus values. These are the data sets that might pose the greatest challenge to our prediction algorithms.
Before turning to forecasting methods, let’s have a closer look at the characteristic short-term variability that has become evident in several of the plots already. Below, we plot a 2-months zoom into the “quiet” parts (i.e. no strong spikes) of different time series:
# Two-month zooms into "quiet" stretches of four different series.
plot_rownr_zoom(10404, "2016-10-01", "2016-12-01")
plot_rownr_zoom(9775, "2015-09-01", "2015-11-01")
plot_rownr_zoom(139120, "2016-10-01", "2016-12-01")
plot_rownr_zoom(110658, "2016-07-01", "2016-09-01")
We see that the high-view-count time series on the left hand side show a very regular periodicity that is strikingly similar for both of them. A similar structure can be seen on the right hand side, although here it is partly distorted by a slight upward trend (upper right) and/or variance caused by lower viewing numbers (lower right).
These plots provide evidence that there is variability on a weekly scale. The next figure will visualise this weekly behaviour in a different way:
def _wday_profile(rownr, start, end):
    """Return the mean-normalised average views per day of week for article
    row `rownr`, restricted to the window [start, end] ('YYYY-MM-DD')."""
    dv = extract_ts_nrm(rownr).copy()
    dv["dates"] = pd.to_datetime(dv['dates'])
    startDate = datetime.strptime(start, '%Y-%m-%d')
    endDate = datetime.strptime(end, '%Y-%m-%d')
    dv = dv[(dv["dates"] >= startDate) & (dv["dates"] <= endDate)]
    # Average per weekday (0 = Monday .. 6 = Sunday).  The original assigned
    # the window-wide mean to every row and then divided by its own mean,
    # which made wday_views identically 1.0 and hid the weekly signal.
    prof = dv.groupby(dv["dates"].dt.dayofweek)["views"].mean().reset_index()
    prof.columns = ["wday", "wday_views"]
    prof["wday_views"] = prof["wday_views"] / prof["wday_views"].mean()
    prof["rowname"] = str(rownr)
    return prof

# Build the weekday profiles for the three quiet windows examined above.
dateView1 = _wday_profile(10404, "2016-10-01", "2016-12-01")
dateView2 = _wday_profile(9775, "2015-09-01", "2015-11-01")
dateView3 = _wday_profile(110658, "2016-07-01", "2016-09-01")
m = pd.concat([dateView1, dateView2, dateView3])
ggplot(m, aes("wday", "wday_views", color="rowname")) + geom_jitter(size=4, width=0.1) + labs(x="Day of the week", y="Relative average views")
In this plot, a period of 7 days is indicated by the vertical blue line.
As expected, we find that all four data sets share a strong signal at a period of 1 week. This is particularly evident in the clean time series’ with high median views, but also still in the article with lower views. In our following analysis we can therefore reasonably assume that a period of 7 days is present in all our articles.
Now that we have identified a sample of time series’ with extreme parameters we can use them to test different forecasting methods. For a sample of 145k articles we will most likely have to rely on an automatic mechanism to make our predictions (although a degree of fine-tuning might be possible). Therefore, our forecasting method will have to perform robustly for a range of different time series shapes and varibilities. Those methods that manage to deal with our extreme examples should be able to deal with any less variable time series as well.
For this competition our forecast period is 2 months, i.e. about 60 days. In the following, we simulate this period and assess our prediction accuracy by keeping a hold-out sample of the last 60 days from our forecasting data. After making the prediction we can compare the actual view counts to the forecasted ones.
A popular approach in time series forecasting is to use an autoregressive integrated moving average model; short ARIMA model. This kind of model consists of three parts, parametrised by indeces p, d, q as ARIMA(p, d, q):
auto-regressive / p: we are using past data to compute a regression model for future data. The parameter p indicates the range of lags; e.g. ARIMA(3,0,0) includes t-1, t-2, and t-3 values in the regression to compute the value at t.
integrated / d: this is a differencing parameter, which gives us the number of times we are subtracting the current and the previous values of a time series. Differencing removes the change in a time series in that it stabilises the mean and removes (seasonal) trends. This is necessary since computing the lags (e.g. difference between time t and time t-1) is most meaningful if large-scale trends are removed. A time series where the variance (or amount of variability) (and the autocovariance) are time-invariant (i.e. don’t change from day to day) is called stationary.
moving average / q: this parameter gives us the number of previous error terms to include in the regression error of the model.
Using our insight about the weekly periodicity, we can directly incorporate this frequency when turning our view counts into a time series object (using the ts function). Note, that we also perform some cleaning and outlier rejection using the tsclean tool. As usual, we wrap the modelling and plotting process into a function and then apply it to four time series sets that we know from our previous analysis:
pre_views_arima= pd.DataFrame(columns={'dates','views'})
post_views_arima= pd.DataFrame(columns={'dates','views'})
def plot_auto_arima_rownr(rownr):
    """Fit a seasonal auto-ARIMA on all but the last 60 days of row `rownr`
    and plot the training series together with the 60-day forecast."""
    pageviews_arima= extract_ts(rownr)
    pred_len=60
    train_len=pageviews_arima.shape[0]-pred_len
    pre_views_arima=pageviews_arima.head(train_len)
    post_views_arima= pageviews_arima.tail(pred_len)
    # m=7 encodes the weekly seasonality established earlier in the analysis;
    # the original m=12 (monthly) does not match daily page-view data.
    stepwise_fit = auto_arima(pre_views_arima["views"], start_p=1, start_q=1,
                              max_p=3, max_q=3, m=7,
                              start_P=0,
                              d=1, D=1, trace=True,
                              error_action='ignore',  # don't want to know if an order does not work
                              suppress_warnings=True)
    fc_views =stepwise_fit.predict(n_periods=pred_len)
    plt.plot(pre_views_arima["views"])
    # Draw the forecast AFTER the training range; the original plotted it at
    # indices 0..59, on top of the start of the history.
    plt.plot(range(train_len, train_len + pred_len), fc_views, c="red")
# Auto-ARIMA forecasts for three sample rows.
plot_auto_arima_rownr(1)
plot_auto_arima_rownr(95856)
plot_auto_arima_rownr(108341)
The results are not too bad, actually. Especially the lower left plot. We even got a downturn in the upper left plot. The upper right plot is a challenging problem, because the levelling of the viewer numbers at the end of the time range was not predictable from the previous behaviour. The same is true for the large spike in the lower right plot.
Given that it’s a fully automatic forecast (assuming only weekly periodicities) the auto.arima tool performs decently and provides us with a useful baseline to compare other methods to.
Prophet is an open-source time series forecasting tool developed by Facebook. It is implemented in an R library, and also a Python package (as already shown in this competition).
Prophet works as an additive regression model which decomposes a time series into (i) a (piecewise) linear/logistic trend, (ii) a yearly seasonal component, (iii) a weekly seasonal component, and (iv) an optional list of important days (such as holidays, special events, …). It claims to be “robust to missing data, shifts in the trend, and large outliers”, which would make it well suited for this particular task
First, let’s test the tool:
# Prepare one series in Prophet's input format.
dateView= pd.DataFrame(columns={'dates','views'})
rownr=139120
pageviews= extract_ts(rownr)
pageviews["dates"] = pd.to_datetime(pageviews['dates'])
# Prophet expects exactly two columns named 'ds' (date) and 'y' (value).
pageviews.rename(columns={'dates':'ds',
                          'views':'y'}, inplace=True)
pred_len=60
pred_range=[(pageviews.shape[0]-pred_len+1),pageviews.shape[0]]
# Hold out the final 60 days to score the forecast against.
pre_views=pageviews.head(pageviews.shape[0]-pred_len)
post_views= pageviews.tail(pred_len)
from fbprophet import Prophet
A few notes about the practical workings of prophet:
data format: prophet expects a data frame with two columns: ds, y. The first one holds the dates, the second one the time series counts.
parameter changepoint.prior.scale adjusts the trend flexibility. Increasing this parameter makes the fit more flexible, but also increases the forecast uncertainties and makes it more likely to overfit to noise. The changepoints in the data are automatically detected unless being specified by hand using the changepoints argument (which we don’t do here).
parameter yearly.seasonality=TRUE has to be enabled explicitly and allows prophet to notice large-scale cycles. The importance of this parameter is explored further below.
This is the standard prophet forecast plot:
# Fit on the training portion only: fitting on `pageviews` (which still
# contains the 60-day hold-out) would leak exactly the data we want to
# evaluate the forecast against.
m = Prophet()
m.fit(pre_views)
future = m.make_future_dataframe(periods=pred_len)
future.tail()
forecast = m.predict(future)
m.plot(forecast)
The observed data are plotted as black points and the fitted model, plus forecast, as a blue line. In light blue we see the corresponding uncertainties.
Prophet offers a decomposition plot, where we can inspect the additive components of the model: trend, yearly seasonality, and weekly cycles:
fig2 = m.plot_components(forecast)
We see that prophet recovers the weekly variation pattern we had extracted by hand in the previous section. This is a useful consistency check. The seasonal variability suggests an overall decline in views towards the middle of the year.
Being the ggplot2 freaks that we are, we decide to visualise our forecast in a different way that gives us more control over the output:
ggplot(forecast,aes("ds", "yhat")) + geom_ribbon(aes(x = "ds", ymin = "yhat_lower", ymax = "yhat_upper"), fill = "lightblue") + geom_line(colour = "#ADD8E6") +geom_line(pre_views, aes("ds", "y"), colour = "black") +geom_line(post_views, aes("ds", "y"), colour = "grey")
Here we plot the observed data as black line, our hold-out set as a grey line, and the forecast plus uncertainties in blue and light blue, again. This shows us immediately how our model is performing, and in this case it’s not doing badly.
We turn this ggplot2 version into a plotting function and use it to predict a couple of sample time series. We also include the seasonality parameter (TRUE/FALSE) as a second input:
def plot_prophet_rownr_season(rownr, season):
    """Forecast article row `rownr` with Prophet and plot training data
    (black), hold-out (grey), and forecast band (blues).  `season` toggles
    the yearly seasonality component."""
    art=tpages['article'][rownr]
    loc=tpages['locale'][rownr]
    acc=tpages['access'][rownr]
    pageviews= extract_ts(rownr)
    pageviews["dates"] = pd.to_datetime(pageviews['dates'])
    # Prophet requires the lowercase column names 'ds' and 'y'; the original
    # renamed to 'DS'/'Y', which Prophet rejects (and which the ggplot layers
    # below, referencing 'ds'/'y', could not find either).
    pageviews.rename(columns={'dates':'ds',
                              'views':'y'}, inplace=True)
    pred_len=60
    pred_range=[(pageviews.shape[0]-pred_len+1),pageviews.shape[0]]
    pre_views=pageviews.head(pageviews.shape[0]-pred_len)
    post_views= pageviews.tail(pred_len)
    m = Prophet(changepoint_prior_scale=0.5, yearly_seasonality=season)
    # Fit on the training window only so the grey hold-out curve is a genuine test.
    m.fit(pre_views)
    future = m.make_future_dataframe(periods=pred_len)
    forecast = m.predict(future)
    return ggplot(forecast,aes("ds", "yhat")) + geom_ribbon(aes(x = "ds", ymin = "yhat_lower", ymax = "yhat_upper"), fill = "lightblue") + geom_line(colour = "#ADD8E6") +geom_line(pre_views, aes("ds", "y"), colour = "black") +geom_line(post_views, aes("ds", "y"), colour = "grey")
# Sample forecasts: yearly seasonality off for the first row, on for the rest.
plot_prophet_rownr_season(70772, False)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
plot_prophet_rownr_season(108341, True)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
plot_prophet_rownr_season(95856, True)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
plot_prophet_rownr_season(139120, True)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Enabling prophet to recognise long-term seasonal variations in the data is crucial for a successful forecasting of our time series data. To demonstrate this, below I’m plotting the following two sample curves: the German main page and the entry for Oxygen in the Spanish wikipedia (many thanks to MuonNeutrino for flagging this time series in their great exploratory kernel):
# Seasonality off vs on for the same two rows, to show its importance.
plot_prophet_rownr_season(72480, False)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Initial log joint probability = -12.7547 Optimization terminated normally: Convergence detected: absolute parameter change was below tolerance
plot_prophet_rownr_season(72480, True)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
Initial log joint probability = -12.7547 Optimization terminated normally: Convergence detected: relative gradient magnitude is below tolerance
plot_prophet_rownr_season(139120, False)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this. Initial log joint probability = -2.84109 Optimization terminated normally: Convergence detected: relative gradient magnitude is below tolerance
plot_prophet_rownr_season(139120, True)
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this. Initial log joint probability = -2.84109 Optimization terminated normally: Convergence detected: relative gradient magnitude is below tolerance
The upper row of plot shows forecasts without a seasonal component vs the presence of this component in the lower row. We can clearly see that the seasonal forecasts predict the real time series evolution much better than the others. A seasonal component should be included in a successful prophet model for this project.
Thanks for reading this exploration! I’m grateful for all the upvotes and the great feedback.
Have fun!